All packages used for the analysis and figures in this page:

library(tidyverse)
library(plotly)
library(DT)
library(glmnet)
library(caret)
library(colorRamps)
library(RColorBrewer)
library(colorspace)
library(readxl)

Load in the prepared data:

# Load the prepared data objects from disk — presumably defines
# `annual.changes` and `post.pca`, which are used below; verify against
# the data-preparation script that writes this file.
load("../Prepared_Data.RData")

I’ll split the data into 75% training and 25% test. I’ll use the same rows for the original tau-PET ROI data as well as the PCA-transformed data for comparison.

# Set seed for consistency in random sampling for 10-fold cross-validation
set.seed(127)
# Sample 75% of the rows without replacement for the training set.
# Use FALSE rather than the re-assignable shorthand F.
train.index <- sample(nrow(annual.changes), nrow(annual.changes)*0.75, replace=FALSE)

# Remove unnecessary identifier info from datasets for modeling
original <- annual.changes %>% ungroup() %>% select(-RID, -interval_num)
pca <- post.pca %>% ungroup() %>% select(-RID, -interval_num)

# Pre-processing will be applied in model training with caret

# Subset training + test data for original (ROI) data + PC score data.
# The same row indices are used for both so the two feature sets are
# evaluated on identical train/test partitions.
original.train <- original[train.index, ]
original.test <- original[-train.index, ]
pca.train <- pca[train.index, ]
pca.test <- pca[-train.index, ]

Individual models

Elastic Net Regression

The first model I will build is an elastic net regression model, implemented using glmnet and caret. I like this model as it is very interpretable in terms of variable coefficients, and the regularization parameter refines the variables to only those most important in predicting the outcome variable. I will use ten-fold cross-validation with caret to select the optimal value of alpha and lambda. The alpha parameter ranges from 0 to 1 and dictates the type of regularization to use, with a value of 0 indicating L2 regularization (ridge regression) and a value of 1 indicating L1 regularization (lasso regression). The lambda parameter dictates the magnitude of the penalty applied to coefficients kept in the model.

# Set seed for consistency in random sampling for cross-validation
set.seed(127)

# Use 10-fold cross-validation
myControl <- trainControl(method = "cv", number = 10)

# Train glmnet with caret over a grid of SIX alpha values (0 to 1 in
# steps of 0.2 — the mixing between ridge, alpha=0, and lasso, alpha=1)
# and 100 lambda values ranging from 0.0001 to 1.
# Predictors are centered and scaled within each resampling fold.
regularized.model <- train(CDRSB~., data=original.train, 
                           tuneGrid = expand.grid(
                             alpha = seq(0,1,.2),
                             lambda = seq(0.0001, 1, length = 100)),
                           method = "glmnet",
                           preProcess = c("center", "scale"),
                           trControl = myControl
)

Click to view the (lengthy) regularized.model cross-validation results:

# Print the full cross-validation summary across the alpha/lambda grid
regularized.model
## glmnet 
## 
## 246 samples
##  33 predictor
## 
## Pre-processing: centered (33), scaled (33) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 222, 221, 222, 222, 221, 220, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda  RMSE       Rsquared    MAE      
##   0.0    0.0001  1.0091850  0.21964529  0.6780949
##   0.0    0.0102  1.0091850  0.21964529  0.6780949
##   0.0    0.0203  1.0095210  0.22000778  0.6780781
##   0.0    0.0304  1.0082955  0.22077533  0.6760098
##   0.0    0.0405  1.0034343  0.22124908  0.6684824
##   0.0    0.0506  1.0001182  0.22107889  0.6616317
##   0.0    0.0607  0.9975682  0.22059610  0.6565301
##   0.0    0.0708  0.9958323  0.21987936  0.6521918
##   0.0    0.0809  0.9946206  0.21894477  0.6484039
##   0.0    0.0910  0.9937015  0.21804508  0.6449718
##   0.0    0.1011  0.9931939  0.21703021  0.6421292
##   0.0    0.1112  0.9929397  0.21596961  0.6397903
##   0.0    0.1213  0.9928986  0.21489586  0.6376888
##   0.0    0.1314  0.9930322  0.21380060  0.6358787
##   0.0    0.1415  0.9933035  0.21271905  0.6344478
##   0.0    0.1516  0.9937084  0.21163615  0.6331903
##   0.0    0.1617  0.9942073  0.21055269  0.6320603
##   0.0    0.1718  0.9947496  0.20952807  0.6310555
##   0.0    0.1819  0.9953473  0.20856816  0.6301270
##   0.0    0.1920  0.9960300  0.20759532  0.6296770
##   0.0    0.2021  0.9967858  0.20659816  0.6294918
##   0.0    0.2122  0.9975596  0.20562280  0.6293361
##   0.0    0.2223  0.9983673  0.20465182  0.6292700
##   0.0    0.2324  0.9991772  0.20372388  0.6292946
##   0.0    0.2425  1.0000189  0.20280435  0.6295454
##   0.0    0.2526  1.0008715  0.20190497  0.6298307
##   0.0    0.2627  1.0017365  0.20103545  0.6301325
##   0.0    0.2728  1.0025960  0.20017783  0.6303945
##   0.0    0.2829  1.0034644  0.19935368  0.6306588
##   0.0    0.2930  1.0043386  0.19853976  0.6309003
##   0.0    0.3031  1.0052038  0.19775732  0.6311361
##   0.0    0.3132  1.0060599  0.19701170  0.6313992
##   0.0    0.3233  1.0069096  0.19627330  0.6317978
##   0.0    0.3334  1.0077569  0.19555476  0.6322013
##   0.0    0.3435  1.0085995  0.19486135  0.6326775
##   0.0    0.3536  1.0094309  0.19418158  0.6331392
##   0.0    0.3637  1.0102570  0.19351353  0.6335800
##   0.0    0.3738  1.0110691  0.19287502  0.6340335
##   0.0    0.3839  1.0118723  0.19225054  0.6344717
##   0.0    0.3940  1.0126642  0.19163547  0.6349299
##   0.0    0.4041  1.0134476  0.19104953  0.6353781
##   0.0    0.4142  1.0142299  0.19047261  0.6358233
##   0.0    0.4243  1.0149958  0.18991626  0.6362453
##   0.0    0.4344  1.0157566  0.18934997  0.6366524
##   0.0    0.4445  1.0165039  0.18880932  0.6370484
##   0.0    0.4546  1.0172416  0.18827781  0.6374504
##   0.0    0.4647  1.0179592  0.18777299  0.6378404
##   0.0    0.4748  1.0186695  0.18726221  0.6382165
##   0.0    0.4849  1.0193729  0.18676672  0.6385901
##   0.0    0.4950  1.0200657  0.18627475  0.6389760
##   0.0    0.5051  1.0207439  0.18580232  0.6393382
##   0.0    0.5152  1.0214144  0.18533274  0.6396899
##   0.0    0.5253  1.0220757  0.18487344  0.6400292
##   0.0    0.5354  1.0227272  0.18443190  0.6403660
##   0.0    0.5455  1.0233751  0.18399373  0.6406988
##   0.0    0.5556  1.0240101  0.18356842  0.6410150
##   0.0    0.5657  1.0246431  0.18313318  0.6413253
##   0.0    0.5758  1.0252728  0.18269203  0.6416292
##   0.0    0.5859  1.0258876  0.18226241  0.6419246
##   0.0    0.5960  1.0264921  0.18183314  0.6422142
##   0.0    0.6061  1.0270749  0.18143749  0.6424821
##   0.0    0.6162  1.0276459  0.18106143  0.6427471
##   0.0    0.6263  1.0282070  0.18067795  0.6430037
##   0.0    0.6364  1.0287677  0.18030041  0.6432545
##   0.0    0.6465  1.0293160  0.17993280  0.6435090
##   0.0    0.6566  1.0298632  0.17956440  0.6437581
##   0.0    0.6667  1.0303923  0.17922272  0.6440118
##   0.0    0.6768  1.0309189  0.17888206  0.6442765
##   0.0    0.6869  1.0314366  0.17853524  0.6445463
##   0.0    0.6970  1.0319536  0.17819280  0.6448110
##   0.0    0.7071  1.0324539  0.17785177  0.6450878
##   0.0    0.7172  1.0329503  0.17750758  0.6453744
##   0.0    0.7273  1.0334338  0.17718179  0.6456470
##   0.0    0.7374  1.0339073  0.17687099  0.6459137
##   0.0    0.7475  1.0343765  0.17655168  0.6462074
##   0.0    0.7576  1.0348412  0.17623182  0.6465013
##   0.0    0.7677  1.0353030  0.17591717  0.6467947
##   0.0    0.7778  1.0357524  0.17561193  0.6470871
##   0.0    0.7879  1.0362012  0.17530523  0.6473749
##   0.0    0.7980  1.0366373  0.17501798  0.6476500
##   0.0    0.8081  1.0370644  0.17474459  0.6479195
##   0.0    0.8182  1.0374903  0.17446167  0.6481852
##   0.0    0.8283  1.0379104  0.17417622  0.6484428
##   0.0    0.8384  1.0383295  0.17389315  0.6486964
##   0.0    0.8485  1.0387372  0.17361110  0.6489448
##   0.0    0.8586  1.0391402  0.17332477  0.6491921
##   0.0    0.8687  1.0395373  0.17304424  0.6494311
##   0.0    0.8788  1.0399249  0.17277501  0.6496627
##   0.0    0.8889  1.0403122  0.17250197  0.6498933
##   0.0    0.8990  1.0406964  0.17221953  0.6501185
##   0.0    0.9091  1.0410775  0.17193136  0.6503380
##   0.0    0.9192  1.0414573  0.17164277  0.6505605
##   0.0    0.9293  1.0418293  0.17135438  0.6507958
##   0.0    0.9394  1.0421959  0.17106247  0.6510311
##   0.0    0.9495  1.0425598  0.17077102  0.6512615
##   0.0    0.9596  1.0429126  0.17049993  0.6514830
##   0.0    0.9697  1.0432505  0.17026094  0.6517078
##   0.0    0.9798  1.0435904  0.17001790  0.6519340
##   0.0    0.9899  1.0439227  0.16976846  0.6521492
##   0.0    1.0000  1.0442537  0.16952237  0.6523604
##   0.2    0.0001  1.0336730  0.21612264  0.7111679
##   0.2    0.0102  1.0186078  0.21795056  0.6897552
##   0.2    0.0203  1.0080302  0.21863964  0.6718719
##   0.2    0.0304  1.0026419  0.21685152  0.6583338
##   0.2    0.0405  0.9995774  0.21463337  0.6479468
##   0.2    0.0506  0.9979583  0.21260058  0.6413254
##   0.2    0.0607  0.9980461  0.21000512  0.6366600
##   0.2    0.0708  0.9990844  0.20730852  0.6338037
##   0.2    0.0809  1.0009854  0.20397237  0.6316061
##   0.2    0.0910  1.0033921  0.20032376  0.6308551
##   0.2    0.1011  1.0065939  0.19569910  0.6315614
##   0.2    0.1112  1.0100969  0.19123696  0.6329965
##   0.2    0.1213  1.0140017  0.18670042  0.6345328
##   0.2    0.1314  1.0185590  0.18135941  0.6363666
##   0.2    0.1415  1.0233401  0.17570151  0.6383238
##   0.2    0.1516  1.0279495  0.17009708  0.6400434
##   0.2    0.1617  1.0322509  0.16503407  0.6416691
##   0.2    0.1718  1.0366210  0.15924041  0.6433097
##   0.2    0.1819  1.0406220  0.15334940  0.6447879
##   0.2    0.1920  1.0444330  0.14708707  0.6462644
##   0.2    0.2021  1.0481220  0.14074311  0.6477317
##   0.2    0.2122  1.0516569  0.13460616  0.6492032
##   0.2    0.2223  1.0550872  0.12866323  0.6506230
##   0.2    0.2324  1.0584712  0.12276635  0.6520565
##   0.2    0.2425  1.0617190  0.11730979  0.6533729
##   0.2    0.2526  1.0648617  0.11267097  0.6546447
##   0.2    0.2627  1.0677765  0.10932484  0.6558806
##   0.2    0.2728  1.0703505  0.10770443  0.6571018
##   0.2    0.2829  1.0727696  0.10751158  0.6582565
##   0.2    0.2930  1.0749234  0.10841691  0.6592841
##   0.2    0.3031  1.0769487  0.10956591  0.6603288
##   0.2    0.3132  1.0789405  0.11102297  0.6613347
##   0.2    0.3233  1.0806092  0.11214698  0.6623566
##   0.2    0.3334  1.0821880  0.11321790  0.6633042
##   0.2    0.3435  1.0837561  0.11400465  0.6642850
##   0.2    0.3536  1.0850802  0.11377859  0.6652059
##   0.2    0.3637  1.0862227  0.11326321  0.6659557
##   0.2    0.3738  1.0874059  0.11263165  0.6669766
##   0.2    0.3839  1.0885721  0.11142520  0.6679864
##   0.2    0.3940  1.0896573  0.10946759  0.6689660
##   0.2    0.4041  1.0907258  0.10708718  0.6699706
##   0.2    0.4142  1.0918004  0.10436589  0.6710390
##   0.2    0.4243  1.0926619  0.09984769  0.6720024
##   0.2    0.4344  1.0933877  0.09437195  0.6728120
##   0.2    0.4445  1.0940718  0.08850745  0.6735740
##   0.2    0.4546  1.0946757  0.08261290  0.6743300
##   0.2    0.4647  1.0950347  0.07596252  0.6751368
##   0.2    0.4748  1.0952764  0.06948710  0.6758104
##   0.2    0.4849  1.0954162  0.06452865  0.6763809
##   0.2    0.4950  1.0954274  0.06220213  0.6769661
##   0.2    0.5051  1.0953970  0.06183126  0.6775124
##   0.2    0.5152  1.0954426  0.06191150  0.6780223
##   0.2    0.5253  1.0954729  0.06282015  0.6784775
##   0.2    0.5354  1.0953475  0.06509540  0.6788593
##   0.2    0.5455  1.0951805  0.06771724  0.6792193
##   0.2    0.5556  1.0951756  0.06893104  0.6795837
##   0.2    0.5657  1.0952898  0.06895257  0.6799326
##   0.2    0.5758  1.0953259  0.06911815  0.6802007
##   0.2    0.5859  1.0952966  0.06971122  0.6804134
##   0.2    0.5960  1.0952258  0.07057780  0.6805843
##   0.2    0.6061  1.0951822  0.07134295  0.6807446
##   0.2    0.6162  1.0952509  0.07073518  0.6809298
##   0.2    0.6263  1.0953034  0.07011953  0.6810946
##   0.2    0.6364  1.0953332  0.06957624  0.6812257
##   0.2    0.6465  1.0953540  0.06908073  0.6813542
##   0.2    0.6566  1.0953776  0.06859815  0.6814836
##   0.2    0.6667  1.0954044  0.06809304  0.6815981
##   0.2    0.6768  1.0954494  0.06740587  0.6817239
##   0.2    0.6869  1.0955262  0.06676396  0.6818547
##   0.2    0.6970  1.0956179  0.06610733  0.6820147
##   0.2    0.7071  1.0957096  0.06545746  0.6822117
##   0.2    0.7172  1.0957895  0.06486315  0.6823720
##   0.2    0.7273  1.0958744  0.06431631  0.6825257
##   0.2    0.7374  1.0959629  0.06377086  0.6826770
##   0.2    0.7475  1.0960537  0.06320663  0.6828527
##   0.2    0.7576  1.0961433  0.06258957  0.6830369
##   0.2    0.7677  1.0962221  0.06202452  0.6832147
##   0.2    0.7778  1.0962459  0.06173130  0.6833701
##   0.2    0.7879  1.0962505  0.06156606  0.6835160
##   0.2    0.7980  1.0962545  0.06143044  0.6836508
##   0.2    0.8081  1.0961587  0.03553934  0.6837548
##   0.2    0.8182  1.0960549  0.03538805  0.6838550
##   0.2    0.8283  1.0959120  0.03490059  0.6839511
##   0.2    0.8384  1.0957807  0.03442814  0.6840589
##   0.2    0.8485  1.0956283  0.03408941  0.6841597
##   0.2    0.8586  1.0954620  0.03397987  0.6842385
##   0.2    0.8687  1.0952855  0.03399817  0.6843087
##   0.2    0.8788  1.0950794  0.03410708  0.6843587
##   0.2    0.8889  1.0948708  0.03428161  0.6844051
##   0.2    0.8990  1.0946620  0.03453760  0.6844442
##   0.2    0.9091  1.0944367  0.03485115  0.6844393
##   0.2    0.9192  1.0942076  0.03525807  0.6844227
##   0.2    0.9293  1.0939414  0.03582397  0.6843824
##   0.2    0.9394  1.0936998  0.03619430  0.6843162
##   0.2    0.9495  1.0934560  0.03666459  0.6842467
##   0.2    0.9596  1.0931749  0.03729244  0.6841516
##   0.2    0.9697  1.0929015  0.03802116  0.6840491
##   0.2    0.9798  1.0926311  0.03885898  0.6839466
##   0.2    0.9899  1.0923880  0.03988811  0.6838734
##   0.2    1.0000  1.0921239  0.04049420  0.6837717
##  [ reached getOption("max.print") -- omitted 400 rows ]
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.1213.
# Extract the cross-validation grid results for plotting.
# BUG FIX: the data.frame below previously referenced `reg.param`, which
# is never defined anywhere — the alpha values are stored in `reg.alpha`,
# so this chunk errored with "object 'reg.param' not found".
reg.alpha <- regularized.model$results$alpha
reg.rmse <- regularized.model$results$RMSE
reg.lambda <- regularized.model$results$lambda

# RMSE as a function of lambda, one colored trace per alpha value
p.enet.cv <- data.frame(alpha=reg.alpha, RMSE=reg.rmse, lambda=reg.lambda) %>%
  mutate(alpha=as.character(alpha)) %>%  # discrete color scale per alpha
  ggplot(data=., mapping=aes(x=lambda, y=RMSE, color=alpha)) +
  geom_point() +
  geom_line(aes(group=alpha)) +
  theme_minimal() +
  ggtitle("Elastic Net Regression Cross-Validation Results") +
  theme(plot.title=element_text(hjust=0.5))
ggplotly(p.enet.cv)

What are the optimal values for alpha and lambda obtained via cross-validation?

# Optimal (alpha, lambda) pair selected by minimum cross-validated RMSE
regularized.model$bestTune
##    alpha lambda
## 13     0 0.1213

The 10-fold cross-validation yields an optimal alpha of 0, which indicates the model is full ridge regression (i.e. no lasso). Ridge regression employs L2 regularization, in which the penalty term includes the squared value of each coefficient. I will delve more into the relative importance of each variable and their corresponding coefficient in model evaluation.

First, I’ll evaluate baseline accuracy within the training data:

# In-sample check: predict annual CDR-SoB change on the same training
# rows the ridge model was fit on, and compare against observed values.
train.pred.cdr <- predict(regularized.model, newdata=original.train)
train.real.cdr <- original.train$CDRSB

# Predicted vs. actual pairs, kept for the scatter plots further down
ridge.results.train <- data.frame(
  Predicted = train.pred.cdr,
  Actual = train.real.cdr
)

# Training-set RMSE and R-squared
with(ridge.results.train,
     data.frame(RMSE = RMSE(Predicted, Actual),
                R2 = R2(Predicted, Actual)))

rm(train.pred.cdr, train.real.cdr)
##        RMSE        R2
## 1 0.9011223 0.3945989

The \(R^2\) value of 0.3945989 within the training data is pretty low, and does not bode particularly well for model performance out-of-sample. I’ll calculate the same metrics for the out-of-sample test data predictions:

# Out-of-sample evaluation on the held-out 25% test rows
test.pred.cdr <- predict(regularized.model, newdata=original.test)
test.real.cdr <- original.test$CDRSB
ridge.results.test <- data.frame(
  Predicted = test.pred.cdr,
  Actual = test.real.cdr
)

# Test-set RMSE and R-squared
with(ridge.results.test,
     data.frame(RMSE = RMSE(Predicted, Actual),
                R2 = R2(Predicted, Actual)))

rm(test.pred.cdr, test.real.cdr)
##       RMSE        R2
## 1 0.910047 0.0446316

The \(R^2\) value of 0.0446316 is very poor, suggesting this model does not hold outside of the training data. Interestingly, the RMSE is only marginally larger than that of the training data (0.9011223 vs. 0.910047). The predictions in training vs. test data can be visualized with scatter plots:

# Training-data scatter: ridge-predicted vs. actual annual CDR-SoB change
p.ridge.train <- ggplot(ridge.results.train, aes(x=Actual, y=Predicted)) +
  theme_minimal() +
  geom_point(alpha=0.5, size=3, color="skyblue3", fill="skyblue3") +
  # Linear line of best fit, no confidence ribbon
  geom_smooth(stat="smooth", se=FALSE, method="lm", color="black") +
  labs(y="Training Predicted CDR-SoB Change",
       x="Training Actual CDR-SoB Change") +
  theme(plot.title=element_text(hjust=0.5))
p.ridge.train <- ggplotly(p.ridge.train)

# Test-data scatter; this panel carries the title for the combined figure
p.ridge.test <- ggplot(ridge.results.test, aes(x=Actual, y=Predicted)) +
  theme_minimal() +
  geom_point(alpha=0.5, size=3, color="skyblue3", fill="skyblue3") +
  geom_smooth(stat="smooth", se=FALSE, method="lm", color="black") +
  labs(title="Ridge Regression CDR Sum of Boxes Predictions in Training vs. Test Data",
       y="Test Predicted CDR-SoB Change",
       x="Test Actual CDR-SoB Change") +
  theme(plot.title=element_text(hjust=0.5))
p.ridge.test <- ggplotly(p.ridge.test)

# Place the two interactive panels side by side with independent axes
subplot(p.ridge.train, p.ridge.test,
        shareX=FALSE, shareY=FALSE, titleX=TRUE, titleY=TRUE, margin = 0.05) %>%
  layout(autosize = FALSE, width = 800, height = 400)
rm(p.ridge.train, p.ridge.test)

The predicted changes in CDR Sum of Boxes in the test data don't align well with the actual values.

k-Nearest Neighbors (kNN)

set.seed(127)
# Train kNN model using the same train-control parameters as with the
# elastic net (10-fold cross-validation); tuneLength=20 tries 20 k values.
# BUG FIX: the argument was previously misspelled `preProcces`, so caret
# silently ignored it and fit kNN on unscaled features — the printed
# results below say "No pre-processing". kNN is distance-based, so
# centering/scaling matters; fixing the spelling will change the
# cross-validation results shown below.
knn.fit <- train(CDRSB ~ ., data=original.train, method="knn", 
                 trControl=myControl, tuneLength=20, preProcess=c("center", "scale"))
knn.fit
## k-Nearest Neighbors 
## 
## 246 samples
##  33 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 222, 221, 222, 222, 221, 220, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared    MAE      
##    5  1.112073  0.05616964  0.6693705
##    7  1.102997  0.08091643  0.6538321
##    9  1.104795  0.06537287  0.6574191
##   11  1.108411  0.08281472  0.6722661
##   13  1.104827  0.06815602  0.6755339
##   15  1.091483  0.06820839  0.6599435
##   17  1.087658  0.05708111  0.6597232
##   19  1.091275  0.05683806  0.6620306
##   21  1.079554  0.06857051  0.6545352
##   23  1.081477  0.04637342  0.6607313
##   25  1.078622  0.05115349  0.6541105
##   27  1.072365  0.05638687  0.6514762
##   29  1.075722  0.05676474  0.6520285
##   31  1.082886  0.05032848  0.6619254
##   33  1.084298  0.04822441  0.6651348
##   35  1.086567  0.04501129  0.6663478
##   37  1.084689  0.06037048  0.6678348
##   39  1.085451  0.04771572  0.6697974
##   41  1.086916  0.04565273  0.6733584
##   43  1.088300  0.05139629  0.6770566
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 27.

These \(R^2\) values for the training data are not particularly promising, and the RMSE values are higher than those with the ridge regression. Here’s the RMSE distribution as a function of k nearest neighbors:

# Pull the cross-validation grid out of the caret fit
knn.k <- knn.fit$results$k
knn.rmse <- knn.fit$results$RMSE
knn.r2 <- knn.fit$results$Rsquared

# RMSE as a function of the number of neighbors k
p.knn.rmse <- ggplot(data.frame(k=knn.k, RMSE=knn.rmse),
                     aes(x=k, y=RMSE)) +
  geom_point(color="skyblue4") +
  geom_line(color="skyblue4") +
  labs(y="RMSE", title="Cross-Validated kNN") +
  theme_minimal() +
  theme(plot.title=element_text(hjust=0.5))

# R-squared as a function of k
p.knn.r2 <- ggplot(data.frame(k=knn.k, R2=knn.r2),
                   aes(x=k, y=R2)) +
  geom_point(color="darkgreen") +
  geom_line(color="darkgreen") +
  labs(y="R2", x="k", title="Cross-Validated kNN") +
  theme_minimal() +
  theme(plot.title=element_text(hjust=0.5))

# Convert both panels to interactive plotly objects and stack vertically
p.knn.rmse <- ggplotly(p.knn.rmse)
p.knn.r2 <- ggplotly(p.knn.r2)

subplot(p.knn.rmse, p.knn.r2, nrows = 2, titleY=TRUE, titleX=TRUE)

The optimal k based on cross-validated RMSE is 27. I’ll further explore the performance of this kNN model within the training data:

# In-sample check: predict annual CDR-SoB change on the kNN model's own
# training rows and compare against the observed values.
train.pred.cdr <- predict(knn.fit, newdata=original.train)
train.real.cdr <- original.train$CDRSB

# Predicted vs. actual pairs, kept for the scatter plots below
knn.results.train <- data.frame(
  Predicted = train.pred.cdr,
  Actual = train.real.cdr
)

# Training-set RMSE and R-squared
with(knn.results.train,
     data.frame(RMSE = RMSE(Predicted, Actual),
                R2 = R2(Predicted, Actual)))

rm(train.pred.cdr, train.real.cdr)
##       RMSE        R2
## 1 1.099331 0.0745427

This suggests a weak relationship between the predicted vs. actual CDR Sum of Boxes annual change within the training data. This is unlikely to improve out-of-sample, but it is worth checking:

# Out-of-sample evaluation on the held-out 25% test rows
test.pred.cdr <- predict(knn.fit, newdata=original.test)
test.real.cdr <- original.test$CDRSB
knn.results.test <- data.frame(
  Predicted = test.pred.cdr,
  Actual = test.real.cdr
)

# Test-set RMSE and R-squared
with(knn.results.test,
     data.frame(RMSE = RMSE(Predicted, Actual),
                R2 = R2(Predicted, Actual)))

rm(test.pred.cdr, test.real.cdr)
##        RMSE         R2
## 1 0.9105595 0.01210409

As expected based on the training data results, this model performs very poorly out-of-sample. As with ridge regression, I will visualize the scatter plot distribution:

# Construct ggplot2-scatter plot
# Training data: kNN-predicted vs. actual annual CDR-SoB change
p.knn.train <- knn.results.train %>%
  ggplot(data=., mapping=aes(x=Actual, y=Predicted)) +
  theme_minimal() +
  geom_point(alpha=0.5, size=3, color="skyblue3", fill="skyblue3") +
  # Add regression line of best fit
  geom_smooth(stat="smooth", se=F, method="lm", color="black") +
  ylab("Training Predicted CDR-SoB Change") +
  xlab("Training Actual CDR-SoB Change") +
  theme(plot.title=element_text(hjust=0.5))
p.knn.train <- ggplotly(p.knn.train)

# Test data: same scatter; this panel carries the combined figure's title
p.knn.test <- knn.results.test %>%
  ggplot(data=., mapping=aes(x=Actual, y=Predicted)) +
  theme_minimal() +
  geom_point(alpha=0.5, size=3, color="skyblue3", fill="skyblue3") +
  geom_smooth(stat="smooth", se=F, method="lm", color="black") +
  ggtitle("kNN Regression CDR Sum of Boxes Predictions in Training vs. Test Data") +
  ylab("Test Predicted CDR-SoB Change") +
  xlab("Test Actual CDR-SoB Change") +
  theme(plot.title=element_text(hjust=0.5))
p.knn.test <- ggplotly(p.knn.test)

# Place the training and test panels side by side with independent axes
subplot(p.knn.train, p.knn.test, shareX=F,shareY=F,titleX=T,titleY=T, margin = 0.05) %>%
  layout(autosize = F, width = 800, height = 400)
rm(p.knn.train, p.knn.test)

The kNN-predicted CDR Sum of Boxes values are generally an order of magnitude smaller than the actual CDR Sum of Boxes change values, which is odd.

Neural net

For the neural network, I will min-max normalize the training data to conform to the range [0, 1] as per the sigmoid function.

# Min-max normalize a numeric vector to the range [0, 1].
#
# col: a numeric vector (a data-frame column).
# Returns (col - min) / (max - min), the same length as `col`.
#
# FIX: a constant column (max == min) previously produced 0/0 = NaN for
# every element, which would silently poison downstream model fitting.
# Such a column now maps to all zeros instead.
min_max <- function(col) {
  rng <- range(col)
  if (rng[1] == rng[2]) {
    return(rep(0, length(col)))
  }
  (col - rng[1]) / (rng[2] - rng[1])
}

# Create a copy of the original training dataset and min-max normalize
# every column. NOTE(review): this also normalizes the CDRSB outcome, so
# this model's RMSE/MAE are on the [0, 1] scale and are not directly
# comparable to the ridge/kNN error metrics — confirm this is intended.
original.train.nnet <- original.train
# seq_len() is the safe index sequence (1:ncol() misbehaves for 0 cols)
for (i in seq_len(ncol(original.train.nnet))) {
  original.train.nnet[,i] <- min_max(original.train.nnet[,i])
}
library(nnet)
# Train neural network model using the same train-control parameters as with the elastic net
# Namely, 10-fold cross validation.
# The grid searches 7 hidden-layer sizes crossed with 6 weight-decay
# values (42 candidate models). invisible(capture.output(...)) suppresses
# nnet's verbose per-iteration convergence logging during fitting.
invisible(capture.output(nnet.fit <- train(CDRSB ~ ., 
                  data=original.train.nnet, 
                  method="nnet", 
                  trControl=myControl,
                  tuneGrid = expand.grid(size=c(1, 6, 12, 18, 24, 30, 33),
                                          decay = seq(0, 1, by=0.2)))))

The best neural network tuning parameters identified via 10-fold cross-validation are:

# Optimal (size, decay) pair selected by minimum cross-validated RMSE
nnet.fit$bestTune
##    size decay
## 26   24   0.2

Click to view the full results of cross-validation for neural network fitting:

# Print the full cross-validation summary across the size/decay grid
nnet.fit
## Neural Network 
## 
## 246 samples
##  33 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 221, 221, 222, 222, 221, 222, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE       Rsquared    MAE       
##    1    0.0    0.2256072  0.42498722  0.19674606
##    1    0.2    0.1024647  0.14869892  0.06694266
##    1    0.4    0.1028088  0.15728494  0.06865484
##    1    0.6    0.1032964  0.15948097  0.07035376
##    1    0.8    0.1038871  0.16039845  0.07207176
##    1    1.0    0.1045615  0.16085064  0.07388254
##    6    0.0    0.2002848  0.07692185  0.14336023
##    6    0.2    0.1023972  0.13513110  0.06616206
##    6    0.4    0.1024875  0.14755446  0.06704155
##    6    0.6    0.1026568  0.15163192  0.06793553
##    6    0.8    0.1028703  0.15364389  0.06882404
##    6    1.0    0.1031190  0.15481796  0.06970964
##   12    0.0    0.2172187  0.01702672  0.18538945
##   12    0.2    0.1023574  0.12546393  0.06588163
##   12    0.4    0.1023940  0.14153550  0.06646304
##   12    0.6    0.1024829  0.14724299  0.06705315
##   12    0.8    0.1025967  0.15017326  0.06764728
##   12    1.0    0.1027292  0.15194537  0.06824144
##   18    0.0    0.1346791  0.18396424  0.07632918
##   18    0.2    0.1023314  0.11711337  0.06573733
##   18    0.4    0.1023495  0.13627322  0.06616936
##   18    0.6    0.1024066  0.14337647  0.06661341
##   18    0.8    0.1024804  0.14712167  0.06705762
##   18    1.0    0.1025663  0.14944440  0.06750208
##   24    0.0    0.1982351  0.06268847  0.14773832
##   24    0.2    0.1023130  0.11011623  0.06564818
##   24    0.4    0.1023227  0.13140119  0.06599279
##   24    0.6    0.1023635  0.13975856  0.06634561
##   24    0.8    0.1024168  0.14424432  0.06670026
##   24    1.0    0.1024785  0.14703300  0.06705637
##   30    0.0          NaN         NaN         NaN
##   30    0.2          NaN         NaN         NaN
##   30    0.4          NaN         NaN         NaN
##   30    0.6          NaN         NaN         NaN
##   30    0.8          NaN         NaN         NaN
##   30    1.0          NaN         NaN         NaN
##   33    0.0          NaN         NaN         NaN
##   33    0.2          NaN         NaN         NaN
##   33    0.4          NaN         NaN         NaN
##   33    0.6          NaN         NaN         NaN
##   33    0.8          NaN         NaN         NaN
##   33    1.0          NaN         NaN         NaN
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 24 and decay = 0.2.

The final model is described as follows:

# Architecture of the selected network (inputs, hidden units, weights)
nnet.fit$finalModel
## a 33-24-1 network with 841 weights
## inputs: Age_Baseline amygdala bankssts caudalanteriorcingulate cuneus entorhinal fusiform hippocampus inferiorparietal inferiortemporal insula isthmuscingulate lateraloccipital lingual middlefr middletemporal orbitofr paracentral parahippocampal parsfr pericalcarine postcentral posteriorcingulate precentral precuneus rostralanteriorcingulate superiorfrontal superiorparietal superiortemporal supramarginal temporalpole transversetemporal Sex_Male 
## output(s): .outcome 
## options were - decay=0.2

The optimal neural network model includes a hidden layer comprised of 24 neurons, with a weight decay of 0.2. First, I will check performance within the training data:

# Pull the cross-validation grid out of the caret fit
n.neurons <- nnet.fit$results$size
nnet.rmse <- nnet.fit$results$RMSE
nnet.weight <- nnet.fit$results$decay

# RMSE vs. hidden-layer size, one colored trace per weight-decay value;
# decay is coerced to character so it gets a discrete color scale
p.nnet.cv <- ggplot(
  transform(data.frame(size=n.neurons, RMSE=nnet.rmse, decay=nnet.weight),
            decay = as.character(decay)),
  aes(x=size, y=RMSE, color=decay)) +
  geom_point() +
  geom_line(aes(group=decay)) +
  labs(color="Weight Decay",
       title="Neural Network Cross-Validation Results") +
  theme_minimal() +
  theme(plot.title=element_text(hjust=0.5))
ggplotly(p.nnet.cv)

There’s a striking difference in RMSE between weight=0 and all other weights. Zooming in on the five weights greater than 0 shows that the weight decay 0.2 consistently shows lower RMSE than models with the other weights.

Next, I’ll check model performance in the training dataset:

# Predict change in CDR Sum of Boxes value based on training data upon which neural network model was trained
# NOTE(review): predictions and actuals here are on the min-max [0, 1]
# scale, so the RMSE below is not directly comparable to the other models.
train.pred.cdr <- predict(nnet.fit, newdata=original.train.nnet, type="raw")

# Compare with the actual change in CDR Sum of Boxes
train.real.cdr <- original.train.nnet$CDRSB

# Combine actual CDR vs. predicted CDR into a dataframe
nnet.results.train <- data.frame(Predicted=train.pred.cdr, Actual=train.real.cdr)

# Calculate RMSE and R-squared for training data
data.frame(RMSE = RMSE(train.pred.cdr, train.real.cdr),
           R2 = R2(train.pred.cdr, train.real.cdr))

# Drop the temporaries; the paired results live on in nnet.results.train
rm(train.pred.cdr, train.real.cdr)
##        RMSE         R2
## 1 0.1084947 0.06147556

This \(R^2\) value of 0.06147556 is very low, though not quite as low as that obtained with kNN. This can be compared with out-of-sample performance with the test data – but first, the test data is also min-max normalized:

# Create a copy of the original test dataset and min-max normalize it.
# BUG FIX: normalize the test columns with the TRAINING set's min/max
# rather than the test set's own ranges. The network was fit on features
# scaled by the training data; rescaling the test set by its own
# (different) ranges puts the two sets on inconsistent scales and leaks
# test-set information into the preprocessing. This will change the
# test-set metrics reported below.
original.test.nnet <- original.test
for (i in seq_len(ncol(original.test.nnet))) {
  train.rng <- range(original.train[[i]])
  original.test.nnet[[i]] <-
    (original.test.nnet[[i]] - train.rng[1]) / (train.rng[2] - train.rng[1])
}

test.pred.cdr <- predict(nnet.fit, newdata=original.test.nnet)
test.real.cdr <- original.test.nnet$CDRSB
nnet.results.test <- data.frame(Predicted=test.pred.cdr, Actual=test.real.cdr)

# Calculate RMSE and R-squared for test data (on the normalized scale)
data.frame(RMSE = RMSE(test.pred.cdr, test.real.cdr),
           R2 = R2(test.pred.cdr, test.real.cdr))

rm(test.pred.cdr, test.real.cdr)
##       RMSE          R2
## 1 0.164727 0.009820704

This out-of-sample \(R^2\) is the worst of the three models, at 0.009820704. The training vs. testing data performance can be visualized with a scatter plot:

# Construct ggplot2-scatter plot
# Training data: network-predicted vs. actual (normalized) CDR-SoB change
p.nnet.train <- nnet.results.train %>%
  ggplot(data=., mapping=aes(x=Actual, y=Predicted)) +
  theme_minimal() +
  geom_point(alpha=0.5, size=3, color="skyblue3", fill="skyblue3") +
  # Add regression line of best fit
  geom_smooth(stat="smooth", se=F, method="lm", color="black") +
  ylab("Training Predicted CDR-SoB Change") +
  xlab("Training Actual CDR-SoB Change") +
  theme(plot.title=element_text(hjust=0.5))
p.nnet.train <- ggplotly(p.nnet.train)

# Test data: same scatter; this panel carries the combined figure's title
p.nnet.test <- nnet.results.test %>%
  ggplot(data=., mapping=aes(x=Actual, y=Predicted)) +
  theme_minimal() +
  geom_point(alpha=0.5, size=3, color="skyblue3", fill="skyblue3") +
  geom_smooth(stat="smooth", se=F, method="lm", color="black") +
  ggtitle("Neural Network CDR Sum of Boxes Predictions in Training vs. Test Data") +
  ylab("Test Predicted CDR-SoB Change") +
  xlab("Test Actual CDR-SoB Change") +
  theme(plot.title=element_text(hjust=0.5))
p.nnet.test <- ggplotly(p.nnet.test)

# Place the training and test panels side by side with independent axes
subplot(p.nnet.train, p.nnet.test, shareX=F,shareY=F,titleX=T,titleY=T, margin = 0.05) %>%
  layout(autosize = F, width = 800, height = 400)
rm(p.nnet.train, p.nnet.test)

Random Forest

Compare accuracy